Table of Contents

  • 1  Loading and preparing data
    • 1.1  Loading original participant data (NZ)
      • 1.1.1  Melt dataframe to long format
    • 1.2  Loading replication participant data (US)
    • 1.3  Loading 2nd replication data (US, with reading measures)
      • 1.3.1  Compute reading subscales
    • 1.4  Retrieving vectors for words and dimension word pairs
  • 2  Ranking color-semantic associations in word embeddings
    • 2.1  Using single dimension words
    • 2.2  Using dimension axes (word pair contrasts), with nearest neighbor (cosine) method
  • 3  Creating datasets for statistical models
    • 3.1  Merging data and predictors
      • 3.1.1  Merge data
      • 3.1.2  Add word frequency (Van Paridon & Thompson, 2020)
      • 3.1.3  Add concreteness (Brysbaert et al., 2014)
      • 3.1.4  Add Small World of Words associations (De Deyne et al., 2018)
      • 3.1.5  Add cosine distances (Mikolov et al., 2013)
      • 3.1.6  Common Crawl
      • 3.1.7  Subtitles
    • 3.2  COCA
    • 3.3  Filtered COCA
    • 3.4  No neighbors COCA
    • 3.5  No names COCA
    • 3.6  Correlations between predictors
      • 3.6.1  Correlations between predictors in original data (NZ)
      • 3.6.2  Correlations between predictors in 2nd replication data (US)
      • 3.6.3  Correlations between predictors in full dataset
    • 3.7  Standardize predictors and write to file
  • 4  Nameability of color-dimension associations
      • 4.0.1  Exporting names generated by participants for use in training corpus filtering
      • 4.0.2  Correlating COCA-fiction cosine similarities to nameability measures
      • 4.0.3  Correlating group-averaged human ratings to nameability measure differentials.
      • 4.0.4  Correlation of group-averaged split-inverse ratings with nameability measures
  • 5  Extracting non-color nearest neighbors for each dimension
  • 6  More figures
    • 6.1  Mean color ratings on each dimension
    • 6.2  Scatterplot with connected points
  • 7  Convert notebook to html
In [3]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display, display_markdown

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import subprocess as sp
import numpy as np
import pandas as pd
import seaborn as sns
import arviz as az
import bambi
import copy
import warnings

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 8]
plt.rcParams['figure.dpi'] = 300

from scipy.stats import pearsonr, spearmanr
from itertools import chain
from collections import Counter

from subs2vec.utensils import log_timer
from subs2vec.vecs import Vectors
from subs2vec.neighbors import compute_nn

def display_md(md, **kwargs):
    return display_markdown(md, raw=True, **kwargs)

def convert_notebook(title, output='html'):
    convert = sp.run(f'jupyter nbconvert {title}.ipynb --to {output} --output {title}.{output}'.split(' '))
    if convert.returncode == 0:
        display_md(f'Jupyter notebook `{title}` converted successfully.')
    else:
        display_md(f'Error: encountered problem converting Jupyter notebook `{title}`')

def download(fname):
    dl = sp.run(f'wget {fname}'.split(' '))
    if dl.returncode == 0:
        display_md(f'Download of `{fname}` succesful.')
    else:
        display_md(f'Download of `{fname}` failed.')
        
@log_timer
def filter_vecs(vecs, filter_words):
    filtered_vecs = copy.deepcopy(vecs)
    filtered_vecs.vectors = filtered_vecs.vectors[np.isin(filtered_vecs.words, filter_words)]
    filtered_vecs.words = filtered_vecs.words[np.isin(filtered_vecs.words, filter_words)]
    filtered_vecs.n = len(filtered_vecs.words)
    display_md(f'Filtered {vecs.n} vectors, {filtered_vecs.n} remaining.')
    return filtered_vecs

def norm(x):
    return x / np.linalg.norm(x, 2)

sns.set(style='whitegrid')
pd.options.mode.chained_assignment = None
WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.

Loading and preparing data¶

Loading original participant data (NZ)¶

In [2]:
df = pd.read_csv('data/saysani_data.tsv', sep='\t')
display(df)
participant white red orange yellow green blue purple brown black dimension group pp_id
0 1 1 7 7 5 1 1 1 3 7 cold-hot sighted sighted_1
1 1 7 1 4 2 3 3 6 6 7 ripe-unripe sighted sighted_1
2 1 1 5 6 7 4 2 3 7 6 new-old sighted sighted_1
3 1 1 7 2 1 4 2 3 5 7 submissive-aggressive sighted sighted_1
4 1 1 7 6 1 2 2 5 3 5 selfless-jealous sighted sighted_1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
539 12 1 2 1 2 2 3 2 3 4 soft-hard blind blind_12
540 12 4 3 3 4 2 2 3 2 5 light-heavy blind blind_12
541 12 2 4 2 1 2 1 2 3 2 relaxed-tense blind blind_12
542 12 4 2 1 1 1 3 2 3 5 alive-dead blind blind_12
543 12 6 7 4 3 4 4 1 2 5 fast-slow blind blind_12

544 rows × 13 columns

Melt dataframe to long format¶

In [3]:
# these are the colors in the data
colors = ['white', 'red', 'orange', 'yellow', 'green', 'blue', 'purple', 'brown', 'black']

# melt
df_orig = df.melt(
    id_vars=['group', 'dimension', 'pp_id'],
    value_vars=colors,
    var_name='color',
    value_name='rating',
)

# pull out dimension words
dimension_labels = df_orig['dimension'].unique()
dimension_pairs = [pair.split('-') for pair in dimension_labels]
dimensions = list(chain(*dimension_pairs))

# add experiment and self vs. other variables for when we add the replication experiment later
df_orig['experiment'] = 'original'
df_orig['self_vs_other'] = 'self'

display(df_orig)
group dimension pp_id color rating experiment self_vs_other
0 sighted cold-hot sighted_1 white 1 original self
1 sighted ripe-unripe sighted_1 white 7 original self
2 sighted new-old sighted_1 white 1 original self
3 sighted submissive-aggressive sighted_1 white 1 original self
4 sighted selfless-jealous sighted_1 white 1 original self
... ... ... ... ... ... ... ...
4891 blind soft-hard blind_12 black 4 original self
4892 blind light-heavy blind_12 black 5 original self
4893 blind relaxed-tense blind_12 black 2 original self
4894 blind alive-dead blind_12 black 5 original self
4895 blind fast-slow blind_12 black 5 original self

4896 rows × 7 columns

Loading replication participant data (US)¶

In [4]:
df_rep = pd.read_csv('data/replication1_data.csv')

# little bit of data munging, drop test participant and catch trials
df_rep = df_rep[(df_rep['pp_id'] != 3) & (df_rep['question_type'] != 'catch')]
df_rep = df_rep.drop(columns=['question_type', 'prompt_pre_1'])

# melt to long format
df_rep = df_rep.melt(
    id_vars=['dimension', 'color', 'pp_id'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)

# more data munging
df_rep['pp_id'] = 'sighted_' + df_rep['pp_id'].astype(str)
df_rep['self_vs_other'] = df_rep['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_rep['group'] = 'sighted'
df_rep['experiment'] = 'replication_1'

# there is a weird typo in one of the dimensions (?), so let's correct that here as well
df_rep['dimension'] = df_rep['dimension'].replace({'like-dis...like': 'like-dislike'})

display(df_rep)
dimension color pp_id self_vs_other rating group experiment
0 clean-dirty yellow sighted_69819 self 5 sighted replication_1
1 soft-hard yellow sighted_69819 self 2 sighted replication_1
2 ripe-unripe yellow sighted_69819 self 1 sighted replication_1
3 selfless-jealous yellow sighted_69819 self 5 sighted replication_1
4 high-low yellow sighted_69819 self 1 sighted replication_1
... ... ... ... ... ... ... ...
9567 like-dislike orange sighted_69785 other 4 sighted replication_1
9568 new-old orange sighted_69785 other 4 sighted replication_1
9569 clean-dirty orange sighted_69785 other 5 sighted replication_1
9570 relaxed-tense orange sighted_69785 other 5 sighted replication_1
9571 active-passive orange sighted_69785 other 3 sighted replication_1

9572 rows × 7 columns

Loading 2nd replication data (US, with reading measures)¶

In [5]:
df_read = pd.read_csv('data/replication2_data_with_reading.csv').drop(columns=['Unnamed: 0', 'X'])

display(df_read)
dimension group subj_id color value question_type others_choice art fiction nonfiction ... Q9_17 Q9_18 Q9_19 Q9_20 Q9_21 composite_read upper_art upper_fiction upper_nonfiction upper_read_motivation
0 cold-hot replication-sighted 69212 brown 4 semantic_diff 4 3.0 0.0 1.0 ... 2.0 2.0 2.0 2.0 2.0 2.000000 0.0 0.0 1.0 1.0
1 ripe-unripe replication-sighted 69212 brown 7 semantic_diff 6 3.0 0.0 1.0 ... 2.0 2.0 2.0 2.0 2.0 2.000000 0.0 0.0 1.0 1.0
2 new-old replication-sighted 69212 brown 6 semantic_diff 6 3.0 0.0 1.0 ... 2.0 2.0 2.0 2.0 2.0 2.000000 0.0 0.0 1.0 1.0
3 submissive-aggressive replication-sighted 69212 brown 2 semantic_diff 2 3.0 0.0 1.0 ... 2.0 2.0 2.0 2.0 2.0 2.000000 0.0 0.0 1.0 1.0
4 selfless-jealous replication-sighted 69212 brown 5 semantic_diff 4 3.0 0.0 1.0 ... 2.0 2.0 2.0 2.0 2.0 2.000000 0.0 0.0 1.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14251 light-heavy replication-sighted 68129 red 6 semantic_diff 5 11.0 0.0 1.0 ... 1.0 0.0 -1.0 0.0 1.0 -0.555556 1.0 0.0 1.0 0.0
14252 relaxed-tense replication-sighted 68129 red 6 semantic_diff 5 11.0 0.0 1.0 ... 1.0 0.0 -1.0 0.0 1.0 -0.555556 1.0 0.0 1.0 0.0
14253 alive-dead replication-sighted 68129 red 7 semantic_diff 6 11.0 0.0 1.0 ... 1.0 0.0 -1.0 0.0 1.0 -0.555556 1.0 0.0 1.0 0.0
14254 fast-slow replication-sighted 68129 red 1 semantic_diff 3 11.0 0.0 1.0 ... 1.0 0.0 -1.0 0.0 1.0 -0.555556 1.0 0.0 1.0 0.0
14255 high-low replication-sighted 68129 red 1 semantic_diff 2 11.0 0.0 1.0 ... 1.0 0.0 -1.0 0.0 1.0 -0.555556 1.0 0.0 1.0 0.0

14256 rows × 36 columns

Compute reading subscales¶

In [6]:
df_read['reading_motivation'] = df_read.apply(
    lambda x: (0
    + x['Q9_1']
    + x['Q9_2']
    + x['Q9_3']
    + x['Q9_4']
    + x['Q9_5']
    + x['Q9_6']
    + x['Q9_7']
    + x['Q9_8']
    + x['Q9_9']
    + x['Q9_10']
    + x['Q9_11']
    + x['Q9_12']
    + x['Q9_13']
    + x['Q9_14'] * -1
    + x['Q9_15']
    + x['Q9_16']
    + x['Q9_17'] * -1
    + x['Q9_18']
    + x['Q9_19']
    + x['Q9_20']
    + x['Q9_21']) / 21,
axis=1)

df_read['reading_part_of_self'] = df_read.apply(
    lambda x: (0
    + x['Q9_2']
    + x['Q9_3']
    + x['Q9_4']
    + x['Q9_5']
    + x['Q9_6']
    + x['Q9_9']
    + x['Q9_10']
    + x['Q9_11']) / 8,
axis=1)

df_read['reading_efficacy'] = df_read.apply(
    lambda x: (0
    + x['Q9_1']
    + x['Q9_14'] * -1
    + x['Q9_16']
    + x['Q9_17'] * -1
    + x['Q9_19']
    + x['Q9_20']) / 6,
axis=1)

df_read['reading_recognition'] = df_read.apply(
    lambda x: (0
    + x['Q9_12']
    + x['Q9_13']
    + x['Q9_15']) / 3,
axis=1)

df_read['reading_other_realms'] = df_read.apply(
    lambda x: (0
    + x['Q9_7']
    + x['Q9_8']
    + x['Q9_18']
    + x['Q9_21']) / 4,
axis=1)
In [7]:
# rename participant id column to match earlier datasets
df_read = df_read.rename(columns={'subj_id': 'pp_id'})

# melt to long format
df_read = df_read.melt(
    id_vars=['dimension', 'color', 'pp_id', 'art', 'fiction', 'nonfiction', 'reading_motivation',
             'reading_part_of_self', 'reading_efficacy', 'reading_recognition', 'reading_other_realms'],
    value_vars=['value', 'others_choice'],
    var_name='self_vs_other',
    value_name='rating',
)

# more data munging
df_read['pp_id'] = 'sighted_' + df_read['pp_id'].astype(str)
df_read['self_vs_other'] = df_read['self_vs_other'].replace({'value': 'self', 'others_choice': 'other'})
df_read['group'] = 'sighted'
df_read['experiment'] = 'replication_2'

display(df_read)
dimension color pp_id art fiction nonfiction reading_motivation reading_part_of_self reading_efficacy reading_recognition reading_other_realms self_vs_other rating group experiment
0 cold-hot brown sighted_69212 3.0 0.0 1.0 1.619048 2.000 0.666667 2.0 2.00 self 4 sighted replication_2
1 ripe-unripe brown sighted_69212 3.0 0.0 1.0 1.619048 2.000 0.666667 2.0 2.00 self 7 sighted replication_2
2 new-old brown sighted_69212 3.0 0.0 1.0 1.619048 2.000 0.666667 2.0 2.00 self 6 sighted replication_2
3 submissive-aggressive brown sighted_69212 3.0 0.0 1.0 1.619048 2.000 0.666667 2.0 2.00 self 2 sighted replication_2
4 selfless-jealous brown sighted_69212 3.0 0.0 1.0 1.619048 2.000 0.666667 2.0 2.00 self 5 sighted replication_2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
28507 light-heavy red sighted_68129 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25 other 5 sighted replication_2
28508 relaxed-tense red sighted_68129 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25 other 5 sighted replication_2
28509 alive-dead red sighted_68129 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25 other 6 sighted replication_2
28510 fast-slow red sighted_68129 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25 other 3 sighted replication_2
28511 high-low red sighted_68129 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25 other 2 sighted replication_2

28512 rows × 15 columns

In [8]:
df_read.describe()
Out[8]:
art fiction nonfiction reading_motivation reading_part_of_self reading_efficacy reading_recognition reading_other_realms rating
count 27864.000000 27864.000000 27864.000000 27864.000000 27864.000000 27864.000000 27864.000000 27864.000000 28512.000000
mean 7.616279 0.593023 0.755814 -0.107973 -0.280523 0.203488 -0.616279 0.151163 3.693147
std 6.612596 0.854251 0.987569 0.646210 0.879559 0.646210 0.910077 0.759909 1.424941
min -5.000000 0.000000 0.000000 -1.619048 -2.000000 -1.000000 -2.000000 -2.000000 1.000000
25% 3.000000 0.000000 0.000000 -0.571429 -1.000000 -0.333333 -1.333333 -0.500000 3.000000
50% 6.000000 0.000000 0.000000 -0.119048 -0.375000 0.166667 -0.666667 0.250000 4.000000
75% 10.000000 1.000000 1.000000 0.285714 0.250000 0.666667 0.000000 0.500000 5.000000
max 26.000000 4.000000 4.000000 1.619048 2.000000 2.000000 2.000000 2.000000 7.000000
In [10]:
corrs = df_read[['art', 'fiction', 'nonfiction', 'reading_motivation', 'reading_part_of_self',
                 'reading_efficacy', 'reading_recognition', 'reading_other_realms']].corr().round(2)
mask = np.zeros_like(corrs)
mask[np.triu_indices_from(mask)] = True
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True, mask=mask)
g.set_yticklabels(g.get_yticklabels(), rotation=0);
No description has been provided for this image
In [11]:
g = sns.histplot(x='art', data=df_read)
No description has been provided for this image

Retrieving vectors for words and dimension word pairs¶

In [1]:
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs = filter_vecs(vecs, np.array(colors + dimensions))
vecs_dict = vecs.as_dict()

color_vecs = filter_vecs(vecs, np.array(colors))
dimension_vecs = filter_vecs(vecs, np.array(dimensions))

dimension_pair_vecs = np.vstack([norm(vecs_dict[pair[0]] - vecs_dict[pair[1]]) for pair in dimension_pairs])

Ranking color-semantic associations in word embeddings¶

Using single dimension words¶

In [12]:
dimension_neighbors = compute_nn(color_vecs, dimension_vecs.vectors, dimension_vecs.words, num_neighbors=9, whole_matrix=True)
dimension_neighbors = dimension_neighbors.drop(columns=[
    'neighbor -1',
    'neighbor -2',
    'neighbor -3',
    'neighbor -4',
    'neighbor -5',
    'neighbor -6',
    'neighbor -7',
    'neighbor -8',
    'neighbor -9'
]).rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds
[INFO] computing analogies using whole matrix additive method
[INFO] <function compute_nn at 0x16c553250> ran in 0.001 seconds
dimension neighbor 0 neighbor 1 neighbor 2 neighbor 3 neighbor 4 neighbor 5 neighbor 6 neighbor 7 neighbor 8
0 like white black yellow orange green purple blue brown red
1 old white brown black yellow orange green blue red purple
2 new black white green yellow red blue purple orange brown
3 light yellow orange blue green red purple white brown black
4 hard white brown black red orange green yellow purple blue
5 dead black white brown red green purple yellow orange blue
6 cold blue black white green brown red yellow purple orange
7 happy white orange brown yellow purple red green blue black
8 hot red yellow black orange purple white blue green brown
9 heavy black purple red blue brown green yellow orange white
10 fast red white yellow black blue green orange brown purple
11 soft brown green yellow purple orange red white blue black
12 clean white blue black brown yellow green red orange purple
13 slow red yellow purple blue brown black green orange white
14 angry orange red purple black white yellow blue brown green
15 alive green brown orange red yellow black blue purple white
16 sad brown red green black purple yellow blue orange white
17 fresh green red white blue yellow brown black purple orange
18 calm blue green white brown purple red black yellow orange
19 dirty brown yellow blue white red black orange green purple
20 dull brown green red blue yellow orange purple black white
21 relaxed blue white green yellow red purple brown orange black
22 jealous purple red black orange white yellow green blue brown
23 tense white black blue red brown green orange yellow purple
24 exciting green orange purple blue black red white brown yellow
25 active orange black green white brown red blue purple yellow
26 ripe orange green purple yellow red brown blue black white
27 aggressive orange yellow black white brown red green blue purple
28 stale brown orange yellow white green red purple blue black
29 dislike purple brown black orange green yellow red white blue
30 passive black white blue green red brown purple orange yellow
31 selfless black white brown blue orange purple red green yellow
32 submissive white brown black purple green blue orange yellow red
33 unripe orange purple yellow red brown green black blue white

Using dimension axes (word pair contrasts), with nearest neighbor (cosine) method¶

In [13]:
dimension_neighbors = compute_nn(color_vecs, dimension_pair_vecs, np.array(dimension_labels), num_neighbors=9, whole_matrix=True)
dimension_neighbors = dimension_neighbors.drop(columns=[
    'neighbor -1',
    'neighbor -2',
    'neighbor -3',
    'neighbor -4',
    'neighbor -5',
    'neighbor -6',
    'neighbor -7',
    'neighbor -8',
    'neighbor -9'
]).rename(columns={'target': 'dimension'})
display(dimension_neighbors)
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.000 seconds
[INFO] computing analogies using whole matrix additive method
[INFO] <function compute_nn at 0x16c553250> ran in 0.001 seconds
dimension neighbor 0 neighbor 1 neighbor 2 neighbor 3 neighbor 4 neighbor 5 neighbor 6 neighbor 7 neighbor 8
0 cold-hot blue green brown white black purple yellow red orange
1 ripe-unripe green orange purple red brown yellow blue black white
2 new-old green red purple black yellow blue white orange brown
3 submissive-aggressive purple white brown black blue green red yellow orange
4 selfless-jealous brown white black blue orange green yellow purple red
5 active-passive orange green brown red yellow purple black blue white
6 like-dislike white black yellow orange blue green red brown purple
7 clean-dirty white blue green black orange purple red yellow brown
8 fresh-stale green blue red white black purple yellow orange brown
9 calm-angry blue green brown white yellow black purple red orange
10 happy-sad white orange yellow purple blue green red black brown
11 exciting-dull orange white purple black blue red green yellow brown
12 soft-hard green purple yellow blue orange brown red white black
13 light-heavy yellow orange blue white green red purple brown black
14 relaxed-tense purple yellow orange blue green white brown red black
15 alive-dead green orange yellow blue red brown purple white black
16 fast-slow white black green orange red blue yellow brown purple

Creating datasets for statistical models¶

Merging data and predictors¶

Merge data¶

In [14]:
df_joint = pd.concat([df_orig, df_rep, df_read]).reset_index()
display(df_joint)
index group dimension pp_id color rating experiment self_vs_other art fiction nonfiction reading_motivation reading_part_of_self reading_efficacy reading_recognition reading_other_realms
0 0 sighted cold-hot sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN
1 1 sighted ripe-unripe sighted_1 white 7 original self NaN NaN NaN NaN NaN NaN NaN NaN
2 2 sighted new-old sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN
3 3 sighted submissive-aggressive sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN
4 4 sighted selfless-jealous sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42975 28507 sighted light-heavy sighted_68129 red 5 replication_2 other 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25
42976 28508 sighted relaxed-tense sighted_68129 red 5 replication_2 other 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25
42977 28509 sighted alive-dead sighted_68129 red 6 replication_2 other 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25
42978 28510 sighted fast-slow sighted_68129 red 3 replication_2 other 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25
42979 28511 sighted high-low sighted_68129 red 2 replication_2 other 11.0 0.0 1.0 -0.571429 -0.375 -0.833333 -1.0 -0.25

42980 rows × 16 columns

Add word frequency (Van Paridon & Thompson, 2020)¶

In [15]:
freqs = pd.read_csv('../datasets/dedup.en.words.unigrams.tsv', sep='\t')  # not included in git repo
freqs['log_freq'] = np.log(freqs['unigram_freq'])
freqs = freqs.drop(columns='unigram_freq')
display(freqs.round(2))
unigram log_freq
0 the 17.10
1 you 17.06
2 i 17.04
3 to 16.78
4 a 16.59
... ... ...
2397976 tpar1 0.00
2397977 giacoia 0.00
2397978 ourcinders 0.00
2397979 tourret 0.00
2397980 iroki 0.00

2397981 rows × 2 columns

In [16]:
df_joint['word1'] = df_joint['dimension'].apply(lambda x: x.split('-')[0])
df_joint['word2'] = df_joint['dimension'].apply(lambda x: x.split('-')[1])
df_joint = df_joint.merge(freqs, left_on='word1', right_on='unigram', how='left')
df_joint = df_joint.merge(freqs, left_on='word2', right_on='unigram', how='left')
df_joint['frequency'] = df_joint['log_freq_x'] - df_joint['log_freq_y']
df_joint = df_joint.drop(columns=[
    'unigram_x',
    'unigram_y',
    'log_freq_x',
    'log_freq_y'
])
display(df_joint.round(2))
index group dimension pp_id color rating experiment self_vs_other art fiction nonfiction reading_motivation reading_part_of_self reading_efficacy reading_recognition reading_other_realms word1 word2 frequency
0 0 sighted cold-hot sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN cold hot -0.22
1 1 sighted ripe-unripe sighted_1 white 7 original self NaN NaN NaN NaN NaN NaN NaN NaN ripe unripe 3.49
2 2 sighted new-old sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN new old 0.12
3 3 sighted submissive-aggressive sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN submissive aggressive -2.35
4 4 sighted selfless-jealous sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN selfless jealous -2.96
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42975 28507 sighted light-heavy sighted_68129 red 5 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 light heavy 1.24
42976 28508 sighted relaxed-tense sighted_68129 red 5 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 relaxed tense -0.23
42977 28509 sighted alive-dead sighted_68129 red 6 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 alive dead -0.90
42978 28510 sighted fast-slow sighted_68129 red 3 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 fast slow 0.76
42979 28511 sighted high-low sighted_68129 red 2 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 high low 1.24

42980 rows × 19 columns

Add concreteness (Brysbaert et al., 2014)¶

In [17]:
concreteness = pd.read_csv('../datasets/en-brysbaert-2014.tsv', sep='\t')  # not included in git repo
display(concreteness)
word concreteness
0 a 1.46
1 aardvark 4.68
2 aback 1.65
3 abacus 4.52
4 abandon 2.54
... ... ...
37053 zoologist 4.30
37054 zoology 3.37
37055 zoom 3.10
37056 zoophobia 2.04
37057 zucchini 4.87

37058 rows × 2 columns

In [18]:
df_joint = df_joint.merge(concreteness, left_on='word1', right_on='word', how='left')
df_joint = df_joint.merge(concreteness, left_on='word2', right_on='word', how='left')
df_joint['concreteness'] = df_joint['concreteness_x'] - df_joint['concreteness_y']
df_joint = df_joint.drop(columns=[
    'word_x',
    'word_y',
    'concreteness_x',
    'concreteness_y'
])
display(df_joint.round(2))
index group dimension pp_id color rating experiment self_vs_other art fiction nonfiction reading_motivation reading_part_of_self reading_efficacy reading_recognition reading_other_realms word1 word2 frequency concreteness
0 0 sighted cold-hot sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN cold hot -0.22 -0.46
1 1 sighted ripe-unripe sighted_1 white 7 original self NaN NaN NaN NaN NaN NaN NaN NaN ripe unripe 3.49 -0.01
2 2 sighted new-old sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN new old 0.12 0.09
3 3 sighted submissive-aggressive sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN submissive aggressive -2.35 -0.82
4 4 sighted selfless-jealous sighted_1 white 1 original self NaN NaN NaN NaN NaN NaN NaN NaN selfless jealous -2.96 -0.56
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42975 28507 sighted light-heavy sighted_68129 red 5 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 light heavy 1.24 0.84
42976 28508 sighted relaxed-tense sighted_68129 red 5 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 relaxed tense -0.23 0.15
42977 28509 sighted alive-dead sighted_68129 red 6 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 alive dead -0.90 -0.93
42978 28510 sighted fast-slow sighted_68129 red 3 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 fast slow 0.76 0.04
42979 28511 sighted high-low sighted_68129 red 2 replication_2 other 11.0 0.0 1.0 -0.57 -0.38 -0.83 -1.0 -0.25 high low 1.24 0.12

42980 rows × 20 columns

Add Small World of Words associations (De Deyne et al., 2018)¶

In [19]:
swow = pd.read_csv('../datasets/SWOW-EN.R100.csv')  # not included in git repo
display(swow)
Unnamed: 0 id participantID age gender nativeLanguage country education created_at cue R1 R2 R3
0 1 29 3 33 Fe United States Australia NaN 2011-08-12 02:19:38 although nevertheless yet but
1 2 30 3 33 Fe United States Australia NaN 2011-08-12 02:19:38 deal no cards shake
2 3 31 3 33 Fe United States Australia NaN 2011-08-12 02:19:38 music notes band rhythm
3 4 32 3 33 Fe United States Australia NaN 2011-08-12 02:19:38 inform tell rat on NaN
4 5 33 3 33 Fe United States Australia NaN 2011-08-12 02:19:38 way path via method
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1228195 1228196 1530300 132506 29 Ma Canada Australia 5.0 2018-08-10 01:56:27 strange mask weird stranger
1228196 1228197 1530290 132506 29 Ma Canada Australia 5.0 2018-08-10 01:56:27 sunset sea sky clause
1228197 1228198 1530291 132506 29 Ma Canada Australia 5.0 2018-08-10 01:56:27 useless pitty worthless worth
1228198 1228199 1530284 132506 29 Ma Canada Australia 5.0 2018-08-10 01:56:27 volume loud music key
1228199 1228200 1530288 132506 29 Ma Canada Australia 5.0 2018-08-10 01:56:27 whenever who where always

1228200 rows × 13 columns

In [20]:
def add_swow(df, swow, colname):
    swow = pd.DataFrame(swow.groupby('cue')['resp'].value_counts()).rename(columns={'resp': 'n'})
    swow = swow.reset_index()
    df = df.merge(swow, left_on=['word1', 'color'], right_on=['cue', 'resp'], how='left')
    df = df.merge(swow, left_on=['word2', 'color'], right_on=['cue', 'resp'], how='left')
    df['n_x'] = df['n_x'].fillna(0)
    df['n_y'] = df['n_y'].fillna(0)
    df[colname] = df['n_x'] - df['n_y']
    df = df.drop(columns=[
        'cue_x',
        'cue_y',
        'resp_x',
        'resp_y',
        'n_x',
        'n_y',
    ])
    return df

swow = swow[swow['cue'].isin(dimensions)]
swow_NZ = swow[(swow['country'] == 'New Zealand')]  # select only NZ respondents
swow_US = swow[(swow['country'] == 'United States')]  # select only US respondents

# count only R1 (maximal discounting)
df_joint = add_swow(df_joint, swow.rename(columns={'R1': 'resp'}), 'swow_R1')
df_joint = add_swow(df_joint, swow_NZ.rename(columns={'R1': 'resp'}), 'swow_R1_NZ')  # US
df_joint = add_swow(df_joint, swow_US.rename(columns={'R1': 'resp'}), 'swow_R1_US')  # NZ

# count R1, R2, and R3 with equal weight (minimal discounting)
swow_all = swow.melt(
    id_vars=['id', 'participantID', 'created_at', 'cue'],
    value_vars=['R1', 'R2', 'R3'],
    value_name='resp',
)
df_joint = add_swow(df_joint, swow_all, 'swow_all')

# NZ
swow_all_NZ = swow_NZ.melt(
    id_vars=['id', 'participantID', 'created_at', 'cue'],
    value_vars=['R1', 'R2', 'R3'],
    value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_NZ, 'swow_all_NZ')

# US
swow_all_US = swow_US.melt(
    id_vars=['id', 'participantID', 'created_at', 'cue'],
    value_vars=['R1', 'R2', 'R3'],
    value_name='resp',
)
df_joint = add_swow(df_joint, swow_all_US, 'swow_all_US')

display(df_joint)
index group dimension pp_id color rating experiment self_vs_other art fiction ... word1 word2 frequency concreteness swow_R1 swow_R1_NZ swow_R1_US swow_all swow_all_NZ swow_all_US
0 0 sighted cold-hot sighted_1 white 1 original self NaN NaN ... cold hot -0.216432 -0.46 0.0 0.0 0.0 0.0 0.0 0.0
1 1 sighted ripe-unripe sighted_1 white 7 original self NaN NaN ... ripe unripe 3.485549 -0.01 0.0 0.0 0.0 0.0 0.0 0.0
2 2 sighted new-old sighted_1 white 1 original self NaN NaN ... new old 0.119068 0.09 0.0 0.0 0.0 0.0 0.0 0.0
3 3 sighted submissive-aggressive sighted_1 white 1 original self NaN NaN ... submissive aggressive -2.352148 -0.82 0.0 0.0 0.0 0.0 0.0 0.0
4 4 sighted selfless-jealous sighted_1 white 1 original self NaN NaN ... selfless jealous -2.955968 -0.56 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42975 28507 sighted light-heavy sighted_68129 red 5 replication_2 other 11.0 0.0 ... light heavy 1.240142 0.84 0.0 0.0 0.0 0.0 0.0 0.0
42976 28508 sighted relaxed-tense sighted_68129 red 5 replication_2 other 11.0 0.0 ... relaxed tense -0.229652 0.15 0.0 0.0 0.0 0.0 0.0 0.0
42977 28509 sighted alive-dead sighted_68129 red 6 replication_2 other 11.0 0.0 ... alive dead -0.904786 -0.93 -1.0 0.0 0.0 -1.0 0.0 0.0
42978 28510 sighted fast-slow sighted_68129 red 3 replication_2 other 11.0 0.0 ... fast slow 0.763262 0.04 0.0 0.0 0.0 0.0 0.0 0.0
42979 28511 sighted high-low sighted_68129 red 2 replication_2 other 11.0 0.0 ... high low 1.237676 0.12 0.0 0.0 0.0 0.0 0.0 0.0

42980 rows × 26 columns

In [21]:
# check how many participants gave green as a response to various cues (to use as an example in the paper)
counts = swow_all_US.groupby(['cue', 'resp']).count().reset_index()
display(counts[counts['resp'] == 'green'])
cue resp id participantID created_at variable
233 alive green 1 1 1 1
508 clean green 1 1 1 1
1108 exciting green 1 1 1 1
1289 fresh green 1 1 1 1
1456 hard green 1 1 1 1
1706 jealous green 20 20 20 20
1984 new green 1 1 1 1
3010 unripe green 18 18 18 18
In [22]:
display(df_joint.sort_values('swow_all'))
index group dimension pp_id color rating experiment self_vs_other art fiction ... word1 word2 frequency concreteness swow_R1 swow_R1_NZ swow_R1_US swow_all swow_all_NZ swow_all_US
25920 11452 sighted selfless-jealous sighted_68676 green 2 replication_2 self 4.0 0.0 ... selfless jealous -2.955968 -0.56 -19.0 0.0 -10.0 -40.0 -1.0 -20.0
5138 242 sighted selfless-jealous sighted_68736 green 2 replication_1 self NaN NaN ... selfless jealous -2.955968 -0.56 -19.0 0.0 -10.0 -40.0 -1.0 -20.0
37530 23062 sighted selfless-jealous sighted_67653 green 6 replication_2 other 10.0 4.0 ... selfless jealous -2.955968 -0.56 -19.0 0.0 -10.0 -40.0 -1.0 -20.0
25416 10948 sighted selfless-jealous sighted_69192 green 7 replication_2 self 9.0 1.0 ... selfless jealous -2.955968 -0.56 -19.0 0.0 -10.0 -40.0 -1.0 -20.0
16956 2488 sighted selfless-jealous sighted_68719 green 5 replication_2 self 3.0 0.0 ... selfless jealous -2.955968 -0.56 -19.0 0.0 -10.0 -40.0 -1.0 -20.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
466 466 blind clean-dirty blind_8 white 2 original self NaN NaN ... clean dirty 0.600633 -1.16 0.0 0.0 0.0 8.0 0.0 7.0
14129 9233 sighted clean-dirty sighted_68738 white 1 replication_1 other NaN NaN ... clean dirty 0.600633 -1.16 0.0 0.0 0.0 8.0 0.0 7.0
33057 18589 sighted light-heavy sighted_68150 white 2 replication_2 other 9.0 0.0 ... light heavy 1.240142 0.84 1.0 0.0 1.0 8.0 0.0 5.0
12790 7894 sighted clean-dirty sighted_68946 white 1 replication_1 other NaN NaN ... clean dirty 0.600633 -1.16 0.0 0.0 0.0 8.0 0.0 7.0
21537 7069 sighted light-heavy sighted_67884 white 2 replication_2 self 5.0 1.0 ... light heavy 1.240142 0.84 1.0 0.0 1.0 8.0 0.0 5.0

42980 rows × 26 columns

(It looks like there very few responses from NZ, but a little more from US and elsewhere.)

Add cosine distances (Mikolov et al., 2013)¶

In [2]:
def get_cosine(x, vecs_dict):
    zero = np.zeros(300)
    return np.dot(norm(vecs_dict.get(x['word2'], zero) - vecs_dict.get(x['word1'], zero)),
                  vecs_dict.get(x['color'], zero))

Common Crawl¶

In [25]:
vecs = Vectors('../embeddings/cc.en.300.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_cc'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/cc.en.300.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.176 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.056 seconds

Subtitles¶

In [26]:
vecs = Vectors('../embeddings/subs.en.1e6.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_subs'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/subs.en.1e6.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.236 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.043 seconds

COCA¶

In [27]:
# academic
vecs = Vectors('../embeddings/acad.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_acad'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)

# fiction
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)

# magazines
vecs = Vectors('../embeddings/mag.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_mag'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)

# spoken
vecs = Vectors('../embeddings/spok.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_spok'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)

# news
vecs = Vectors('../embeddings/news.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_news'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)

display(df_joint.round(2))
[INFO] loading vectors ../embeddings/acad.en.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.205 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.044 seconds
[INFO] loading vectors ../embeddings/fic.en.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.082 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.041 seconds
[INFO] loading vectors ../embeddings/mag.en.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 5.965 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.040 seconds
[INFO] loading vectors ../embeddings/spok.en.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.355 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.063 seconds
[INFO] loading vectors ../embeddings/news.en.vec
[INFO] <function Vectors.__init__ at 0x16c552d40> ran in 6.311 seconds
[INFO] <function Vectors.as_dict at 0x16c552f80> ran in 0.039 seconds
index group dimension pp_id color rating experiment self_vs_other art fiction ... swow_all swow_all_NZ swow_all_US cosine_cc cosine_subs cosine_acad cosine_fic cosine_mag cosine_spok cosine_news
0 0 sighted cold-hot sighted_1 white 1 original self NaN NaN ... 0.0 0.0 0.0 0.05 0.02 0.04 0.01 -0.06 0.03 -0.04
1 1 sighted ripe-unripe sighted_1 white 7 original self NaN NaN ... 0.0 0.0 0.0 0.02 0.09 0.03 0.16 0.07 -0.20 0.03
2 2 sighted new-old sighted_1 white 1 original self NaN NaN ... 0.0 0.0 0.0 0.12 0.04 0.10 0.07 0.04 0.08 0.03
3 3 sighted submissive-aggressive sighted_1 white 1 original self NaN NaN ... 0.0 0.0 0.0 -0.08 -0.08 -0.09 -0.01 -0.07 -0.05 -0.03
4 4 sighted selfless-jealous sighted_1 white 1 original self NaN NaN ... 0.0 0.0 0.0 0.05 -0.01 -0.01 -0.01 -0.01 0.11 0.01
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42975 28507 sighted light-heavy sighted_68129 red 5 replication_2 other 11.0 0.0 ... 0.0 0.0 0.0 -0.16 -0.04 -0.06 -0.08 -0.16 -0.18 -0.08
42976 28508 sighted relaxed-tense sighted_68129 red 5 replication_2 other 11.0 0.0 ... 0.0 0.0 0.0 0.11 -0.04 0.09 -0.02 -0.01 0.04 0.06
42977 28509 sighted alive-dead sighted_68129 red 6 replication_2 other 11.0 0.0 ... -1.0 0.0 0.0 0.15 0.06 0.02 0.05 -0.00 0.05 0.10
42978 28510 sighted fast-slow sighted_68129 red 3 replication_2 other 11.0 0.0 ... 0.0 0.0 0.0 0.00 -0.02 -0.01 -0.00 -0.05 0.04 -0.03
42979 28511 sighted high-low sighted_68129 red 2 replication_2 other 11.0 0.0 ... 0.0 0.0 0.0 -0.01 0.04 0.01 -0.04 0.06 0.12 0.08

42980 rows × 33 columns

Filtered COCA¶

COCA embeddings, but from COCA corpora without sentences with 1st order cooccurrences (sentences with a color word and a dimension word).

In [3]:
df_joint = pd.read_csv('data/data_plus_predictors.tsv', sep='\t')
In [4]:
vecs = Vectors('../embeddings/fic_cut.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_small'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/fic_cut.en.vec
[INFO] <function Vectors.__init__ at 0x14f366170> ran in 4.590 seconds
[INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.043 seconds
In [5]:
vecs = Vectors('../embeddings/fic_no_1st_order_cut.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_no_1st_order'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1)
[INFO] loading vectors ../embeddings/fic_no_1st_order_cut.en.vec
[INFO] <function Vectors.__init__ at 0x14f366170> ran in 5.076 seconds
[INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.039 seconds

No neighbors COCA¶

COCA embeddings, but from training corpora from which the 25 nearest neighbors of each color and dimension word have been removed (in an attempt to disrupt the "scaffolding" that semantic associations with the colors and dimension words are built on).

We use two filtering regimes, a strong and a weak one. In the strong regime we remove every line that contains any neighbor word. In the weak regime we remove any of the following:

  1. Line that contains a color word and a neighbor of a dimension word.
  2. Line that contains a dimension word and a neighbor of a color word.
  3. Line that contains both a color word and a dimension word.
In [6]:
vecs = Vectors('../embeddings/fic_no_neighbors_strong_no1st.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_no_neighbors_strong'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)

vecs = Vectors('../embeddings/fic_no_neighbors_weak_cut.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_no_neighbors_weak'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic_no_neighbors_strong_no1st.en.vec
[INFO] <function Vectors.__init__ at 0x14f366170> ran in 5.011 seconds
[INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.067 seconds
[INFO] loading vectors ../embeddings/fic_no_neighbors_weak_cut.en.vec
[INFO] <function Vectors.__init__ at 0x14f366170> ran in 4.467 seconds
[INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.041 seconds

No names COCA¶

COCA embeddings, but from training corpora from which the labels generated by at least two participants for color-semantic associations (e.g. the label snow for the combination white and cold) has been removed. (These nameability data are explored in more detail in a section at the end of this notebook.)

In [7]:
# fiction
vecs = Vectors('../embeddings/fic_no_mediators_cut.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()
df_joint['cosine_fic_no_mediators'] = df_joint.apply(lambda x: get_cosine(x, vecs_dict), axis=1).fillna(0)
[INFO] loading vectors ../embeddings/fic_no_mediators_cut.en.vec
[INFO] <function Vectors.__init__ at 0x14f366170> ran in 4.563 seconds
[INFO] <function Vectors.as_dict at 0x14f3663b0> ran in 0.037 seconds
/var/folders/8h/k__12s992nbc7rmfv_w1rptc0000gp/T/ipykernel_62683/653768335.py:57: RuntimeWarning: invalid value encountered in divide
  return x / np.linalg.norm(x, 2)
In [8]:
df_joint.to_csv('data/data_plus_predictors.tsv', sep='\t')

Correlations between predictors¶

Correlations between predictors in original data (NZ)¶

In [31]:
df_orig = df_joint[df_joint['experiment'] == 'original']

corrs = np.abs(df_joint[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
No description has been provided for this image

Correlations between predictors in 2nd replication data (US)¶

In [32]:
df_rep = df_joint[df_joint['experiment'] == 'replication_2']

corrs = np.abs(df_joint[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
No description has been provided for this image

Correlations between predictors in full dataset¶

In [33]:
corrs = np.abs(df_joint[[
    'rating',
    'cosine_cc',
    'cosine_subs',
    'cosine_fic',
    'cosine_fic_no_1st_order',
    'cosine_fic_no_neighbors_weak',
    'cosine_fic_no_neighbors_strong',
    'cosine_fic_no_mediators',
    'swow_all',
    'swow_all_NZ',
    'swow_all_US',
    'frequency',
    'concreteness',
]].corr()).round(2)
g = sns.heatmap(corrs, vmin=0, vmax=1, annot=True)
No description has been provided for this image

Standardize predictors and write to file¶

In [10]:
def standardize(Series):
    return (Series - Series.mean()) / Series.std()

df_joint['art_z'] = standardize(df_joint['art'])
df_joint['fiction_z'] = standardize(df_joint['fiction'])
df_joint['nonfiction_z'] = standardize(df_joint['nonfiction'])
df_joint['reading_motivation_z'] = standardize(df_joint['reading_motivation'])
df_joint['reading_part_of_self_z'] = standardize(df_joint['reading_part_of_self'])
df_joint['reading_efficacy_z'] = standardize(df_joint['reading_efficacy'])
df_joint['reading_recognition_z'] = standardize(df_joint['reading_recognition'])
df_joint['reading_other_realms_z'] = standardize(df_joint['reading_other_realms'])

df_joint['rating_z'] = standardize(df_joint['rating'])
df_joint['frequency_z'] = standardize(df_joint['frequency'])
df_joint['concreteness_z'] = standardize(df_joint['concreteness'])

df_joint['swow_all_z'] = standardize(df_joint['swow_all'])
df_joint['swow_all_NZ_z'] = standardize(df_joint['swow_all_NZ'])
df_joint['swow_all_US_z'] = standardize(df_joint['swow_all_US'])
df_joint['swow_R1_z'] = standardize(df_joint['swow_R1'])
df_joint['swow_R1_NZ_z'] = standardize(df_joint['swow_R1_NZ'])
df_joint['swow_R1_US_z'] = standardize(df_joint['swow_R1_US'])

df_joint['cosine_cc_z'] = standardize(df_joint['cosine_cc'])
df_joint['cosine_subs_z'] = standardize(df_joint['cosine_subs'])

df_joint['cosine_acad_z'] = standardize(df_joint['cosine_acad'])
df_joint['cosine_fic_z'] = standardize(df_joint['cosine_fic'])
df_joint['cosine_mag_z'] = standardize(df_joint['cosine_mag'])
df_joint['cosine_news_z'] = standardize(df_joint['cosine_news'])
df_joint['cosine_spok_z'] = standardize(df_joint['cosine_spok'])

df_joint['cosine_fic_small_z'] = standardize(df_joint['cosine_fic_small'])
df_joint['cosine_fic_no_1st_order_z'] = standardize(df_joint['cosine_fic_no_1st_order'])
df_joint['cosine_fic_no_neighbors_weak_z'] = standardize(df_joint['cosine_fic_no_neighbors_weak'])
df_joint['cosine_fic_no_neighbors_strong_z'] = standardize(df_joint['cosine_fic_no_neighbors_strong'])
df_joint['cosine_fic_no_mediators_z'] = standardize(df_joint['cosine_fic_no_mediators'])

df_joint['blind'] = pd.get_dummies(df_joint['group'])['blind']
df_joint['sighted'] = pd.get_dummies(df_joint['group'])['sighted']
df_joint['group_eff'] = (df_joint['sighted'] - .5) * 2
df_joint['group_z'] = standardize(df_joint['sighted'])

df_joint['original'] = pd.get_dummies(df_joint['experiment'])['original']
df_joint['replication_1'] = pd.get_dummies(df_joint['experiment'])['replication_1']
df_joint['replication_2'] = pd.get_dummies(df_joint['experiment'])['replication_2']

df_joint['other'] = pd.get_dummies(df_joint['self_vs_other'])['other']
df_joint['self'] = pd.get_dummies(df_joint['self_vs_other'])['self']
df_joint['self_vs_other_eff'] = (df_joint['other'] - .5) * 2
df_joint['self_vs_other_z'] = standardize(df_joint['other'])

df_joint.to_csv('data/data_plus_predictors.tsv', sep='\t', index=False)

display(df_joint)
index group dimension pp_id color rating experiment self_vs_other art fiction ... group_z original replication_1 replication_2 other self self_vs_other_eff self_vs_other_z cosine_fic_small cosine_fic_small_z
0 0 sighted cold-hot sighted_1 white 1 original self NaN NaN ... 0.211241 True False False False True -1.0 -0.891882 -0.031634 -0.475593
1 1 sighted ripe-unripe sighted_1 white 7 original self NaN NaN ... 0.211241 True False False False True -1.0 -0.891882 0.116445 1.327408
2 2 sighted new-old sighted_1 white 1 original self NaN NaN ... 0.211241 True False False False True -1.0 -0.891882 0.105395 1.192865
3 3 sighted submissive-aggressive sighted_1 white 1 original self NaN NaN ... 0.211241 True False False False True -1.0 -0.891882 -0.042038 -0.602275
4 4 sighted selfless-jealous sighted_1 white 1 original self NaN NaN ... 0.211241 True False False False True -1.0 -0.891882 -0.082589 -1.096013
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
42975 28507 sighted light-heavy sighted_68129 red 5 replication_2 other 11.0 0.0 ... 0.211241 False False True True False 1.0 1.121199 -0.092165 -1.212613
42976 28508 sighted relaxed-tense sighted_68129 red 5 replication_2 other 11.0 0.0 ... 0.211241 False False True True False 1.0 1.121199 0.070999 0.774055
42977 28509 sighted alive-dead sighted_68129 red 6 replication_2 other 11.0 0.0 ... 0.211241 False False True True False 1.0 1.121199 0.079640 0.879268
42978 28510 sighted fast-slow sighted_68129 red 3 replication_2 other 11.0 0.0 ... 0.211241 False False True True False 1.0 1.121199 0.056388 0.596159
42979 28511 sighted high-low sighted_68129 red 2 replication_2 other 11.0 0.0 ... 0.211241 False False True True False 1.0 1.121199 -0.064931 -0.881009

42980 rows × 78 columns

In [2]:
df_joint = pd.read_csv('data/data_plus_predictors.tsv', sep='\t')

Nameability of color-dimension associations¶

In [35]:
def get_cosine_1word(x, vecs_dict):
    zero = np.zeros(300)
    return np.dot(vecs_dict.get(x['dimension'], zero), vecs_dict.get(x['color'], zero))

# fiction
vecs = Vectors('../embeddings/fic.en.vec', n=2e5, d=300, normalize=True)  # not included in git repo
vecs_dict = vecs.as_dict()

df_names = pd.read_csv('data/color_dimension_nameability.csv')
display(df_names.head())
[INFO] loading vectors ../embeddings/fic.en.vec
[INFO] <function Vectors.__init__ at 0x14494eca0> ran in 6.137 seconds
[INFO] <function Vectors.as_dict at 0x14494eee0> ran in 0.044 seconds
prompt dimension color number_responses avg_words_per_response percent_unique_words percent_unique_lemmas simpson_diversity modal_agreement modal_names modal_response_agreement modal_response
0 happy_brown happy brown 10 1.000000 0.800000 0.800000 0.044444 0.200000 cat,puppy 0.200000 cat,puppy
1 unripe_brown unripe brown 10 1.000000 1.000000 1.000000 0.000000 0.100000 grape,bannana,kiwi,avocado,pear,fruit,tree,coc... 0.100000 grape,bannana,kiwi,avocado,pear,fruit,tree,coc...
2 hard_brown hard brown 10 1.000000 0.900000 0.800000 0.044444 0.200000 wood,rock 0.200000 wood
3 angry_blue angry blue 13 1.076923 0.714286 0.714286 0.054945 0.230769 shark 0.230769 shark
4 sad_brown sad brown 10 1.100000 0.909091 0.909091 0.018182 0.200000 cat 0.200000 cat
In [36]:
# check how many participants provided labels for each color-adjective pair
print(df_names['number_responses'].min())
print(df_names['number_responses'].max())
display(df_names.sort_values('modal_agreement'))
7
13
prompt dimension color number_responses avg_words_per_response percent_unique_words percent_unique_lemmas simpson_diversity modal_agreement modal_names modal_response_agreement modal_response
99 liked_blue liked blue 13 1.000000 1.000000 1.000000 0.000000 0.076923 sonic,sky,bird,pigeon,phone,smurfs,pencil,colo... 0.076923 sonic,sky,bird,pigeon,phone,smurfs,pencil,colo...
40 relaxed_blue relaxed blue 13 1.076923 1.000000 1.000000 0.000000 0.076923 smurfette,meditation,bird,water,tranquility,st... 0.076923 smurfette,meditation,bird,water,tranquility,st...
30 submissive_blue submissive blue 13 1.000000 1.000000 1.000000 0.000000 0.076923 macaw,nun,bird,swallow,butterfly,flower,door,b... 0.076923 macaw,nun,bird,swallow,butterfly,flowers,door,...
91 old_blue old blue 13 1.076923 1.000000 1.000000 0.000000 0.076923 bluecheese,necklace,bird,dress,shoe,smurfs,rug... 0.076923 bluecheese,necklace,bird,dress,shoes,smurfs,ru...
192 clean_yellow clean yellow 12 1.083333 1.000000 1.000000 0.000000 0.083333 table,detergant,sun,glove,hat,flag,ford,mustan... 0.083333 table,detergant,sun,gloves,hat,flag,ford.musta...
... ... ... ... ... ... ... ... ... ... ... ... ...
151 clean_white clean white 9 1.111111 0.700000 0.600000 0.222222 0.555556 sheet 0.333333 sheets
170 ripe_yellow ripe yellow 12 1.000000 0.500000 0.500000 0.318182 0.583333 banana 0.583333 banana
68 cold_blue cold blue 13 1.000000 0.461538 0.461538 0.358974 0.615385 ice 0.615385 ice
147 cold_white cold white 9 1.000000 0.333333 0.333333 0.583333 0.777778 snow 0.777778 snow
158 stale_white stale white 9 1.000000 0.222222 0.222222 0.777778 0.888889 bread 0.888889 bread

306 rows × 12 columns

Exporting names generated by participants for use in training corpus filtering¶

In [37]:
names = df_names['modal_names']
names = list(chain(*[name.split(',') for name in names]))
names_all = set(names)  # all unique names
names_count = Counter(names)
names_2plus = [name[0] for name in names_count.most_common() if name[1] >= 2]  # all names that occur 2+ times
print(f'Number of labels named by at least 2 participants: {len(names_2plus)}')
with open('data/pair_labels_all.txt', 'w') as namesfile:
    namesfile.write('\n'.join(names_all))
with open('data/pair_labels_2plus.txt', 'w') as namesfile:
    namesfile.writelines('\n'.join(names_2plus))
# let's ignore words like "me", "my", and "a" though
Number of labels named by at least 2 participants: 242

Correlating COCA-fiction cosine similarities to nameability measures¶

Since we only have nameability for colors and dimension axis poles (i.e. for yellow and dislike but not yellow and dislike-like), we correlate nameability measures with cosine similarity between color and dimension axis pole.

In [38]:
pearsonr(df_names['simpson_diversity'], df_names['modal_agreement'])
Out[38]:
PearsonRResult(statistic=0.8947743710654124, pvalue=1.816739746708339e-108)
In [39]:
df_names['cosine_fic'] = df_names.apply(lambda x: get_cosine_1word(x, vecs_dict), axis=1)
display(df_names.head())

x = pearsonr(df_names['cosine_fic'], df_names['simpson_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_names['cosine_fic'], df_names['modal_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
g = sns.lmplot(x='cosine_fic', y='simpson_diversity', data=df_names)
g = sns.lmplot(x='cosine_fic', y='modal_agreement', data=df_names)
prompt dimension color number_responses avg_words_per_response percent_unique_words percent_unique_lemmas simpson_diversity modal_agreement modal_names modal_response_agreement modal_response cosine_fic
0 happy_brown happy brown 10 1.000000 0.800000 0.800000 0.044444 0.200000 cat,puppy 0.200000 cat,puppy 0.142680
1 unripe_brown unripe brown 10 1.000000 1.000000 1.000000 0.000000 0.100000 grape,bannana,kiwi,avocado,pear,fruit,tree,coc... 0.100000 grape,bannana,kiwi,avocado,pear,fruit,tree,coc... 0.326845
2 hard_brown hard brown 10 1.000000 0.900000 0.800000 0.044444 0.200000 wood,rock 0.200000 wood 0.193040
3 angry_blue angry blue 13 1.076923 0.714286 0.714286 0.054945 0.230769 shark 0.230769 shark 0.160328
4 sad_brown sad brown 10 1.100000 0.909091 0.909091 0.018182 0.200000 cat 0.200000 cat 0.274516
pearsonr(cosine_fiction, simpson_diversity): 0.185, p-value: 0.001
pearsonr(cosine_fiction, modal_agreement): 0.203, p-value: 0.000
No description has been provided for this image
No description has been provided for this image

Correlating group-averaged human ratings to nameability measure differentials.¶

Since we do not have human ratings for the association between colors and dimension axis poles (only for association between colors and dimension axes), we need to collapse our nameability measures for the two poles of each dimension axis. One way to do this is to compute difference scores.

In [40]:
df_sighted = df_joint.loc[df_joint['group'] == 'sighted']
df_sighted['diversity_word1'] = df_sighted.merge(df_names, how='left', left_on=['word1', 'color'], right_on=['dimension', 'color'])['simpson_diversity']
df_sighted['diversity_word2'] = df_sighted.merge(df_names, how='left', left_on=['word2', 'color'], right_on=['dimension', 'color'])['simpson_diversity']
df_sighted['agreement_word1'] = df_sighted.merge(df_names, how='left', left_on=['word1', 'color'], right_on=['dimension', 'color'])['modal_agreement']
df_sighted['agreement_word2'] = df_sighted.merge(df_names, how='left', left_on=['word2', 'color'], right_on=['dimension', 'color'])['modal_agreement']
df_sighted['diff_diversity'] = (df_sighted['diversity_word1'] - df_sighted['diversity_word2'])
df_sighted['diff_agreement'] = (df_sighted['agreement_word1'] - df_sighted['agreement_word2'])

df_sighted = df_sighted.dropna()
display(df_sighted.head())
df_mean_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).mean().reset_index()
df_sd_sighted = df_sighted.groupby(['color', 'dimension', 'word1', 'word2']).std().reset_index()
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['rating'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(rating, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(cosine_fiction, simpson_diversity_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['cosine_fic'], df_mean_sighted['diff_agreement'])
print(f'pearsonr(cosine_fiction, modal_agreement_difference): {x[0]:.3f}, p-value: {x[1]:.3f}')
index group dimension pp_id color rating experiment self_vs_other art fiction ... other self self_vs_other_eff self_vs_other_z diversity_word1 diversity_word2 agreement_word1 agreement_word2 diff_diversity diff_agreement
14468 0 sighted cold-hot sighted_69212 brown 4 replication_2 self 3.0 0.0 ... 0 1 -1.0 -0.891882 0.000000 0.012821 0.142857 0.285714 -0.012821 -0.142857
14469 1 sighted ripe-unripe sighted_69212 brown 7 replication_2 self 3.0 0.0 ... 0 1 -1.0 -0.891882 0.238095 0.035714 0.428571 0.285714 0.202381 0.142857
14470 2 sighted new-old sighted_69212 brown 6 replication_2 self 3.0 0.0 ... 0 1 -1.0 -0.891882 0.000000 0.000000 0.142857 0.142857 0.000000 0.000000
14471 3 sighted submissive-aggressive sighted_69212 brown 2 replication_2 self 3.0 0.0 ... 0 1 -1.0 -0.891882 0.000000 0.000000 0.142857 0.142857 0.000000 0.000000
14472 4 sighted selfless-jealous sighted_69212 brown 5 replication_2 self 3.0 0.0 ... 0 1 -1.0 -0.891882 0.000000 0.044444 0.142857 0.285714 -0.044444 -0.142857

5 rows × 82 columns

pearsonr(rating, simpson_diversity_difference): 0.036, p-value: 0.666
pearsonr(rating, modal_agreement_difference): -0.012, p-value: 0.890
pearsonr(cosine_fiction, simpson_diversity_difference): 0.091, p-value: 0.278
pearsonr(cosine_fiction, modal_agreement_difference): 0.032, p-value: 0.707
In [41]:
df_mean_sighted['rating_sd'] = df_sd_sighted['rating']
g = sns.lmplot(x='rating_sd', y='diff_diversity', data=df_mean_sighted)
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_sighted['rating_sd'], df_mean_sighted['diff_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001
pearsonr(rating_sd, simpson_diversity): 0.280, p-value: 0.001
No description has been provided for this image

Correlation of group-averaged split-inverse ratings with nameability measures¶

One other way to work around the issue of having only color to dimension axis pole nameability is to split and invert the human ratings of color-dimension axis associations to create two scores per rating: One for the right end of the axis (equal to the rating), and one for the left end of the axis (equal to eight minus the rating). For example: If yellow is assigned a 6 on the scale dislike-like, the rating for yellow/like is 6, but we also create a rating of 2 for yellow/dislike.

In [42]:
df_inverse = df_sighted[[
    'color',
    'word1',
    'rating',
    'diversity_word1',
    'agreement_word1'
]].rename(columns={
    'word1': 'dimension',
    'diversity_word1': 'simpson_diversity',
    'agreement_word1': 'modal_agreement'
})

df_inverse['rating'] = 8 - df_inverse['rating']

df_inverse = pd.concat([df_inverse, df_sighted[[
'color',
    'word2',
    'rating',
    'diversity_word2',
    'agreement_word2'
]].rename(columns={
    'word2': 'dimension',
    'diversity_word2': 'simpson_diversity',
    'agreement_word2': 'modal_agreement'
})])
display(df_inverse)

df_mean_inverse = df_inverse.groupby(['color', 'dimension']).mean().reset_index()
df_sd_inverse = df_inverse.groupby(['color', 'dimension']).std().reset_index()

x = pearsonr(df_mean_inverse['rating'], df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_inverse['rating'], df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
color dimension rating simpson_diversity modal_agreement
14468 brown cold 4 0.000000 0.142857
14469 brown ripe 1 0.238095 0.428571
14470 brown new 2 0.000000 0.142857
14471 brown submissive 6 0.000000 0.142857
14472 brown selfless 3 0.000000 0.142857
... ... ... ... ... ...
41138 yellow hard 2 0.000000 0.125000
41139 yellow heavy 2 0.000000 0.125000
41140 yellow tense 2 0.000000 0.125000
41141 yellow dead 2 0.000000 0.125000
41142 yellow slow 2 0.000000 0.125000

46272 rows × 5 columns

pearsonr(rating, simpson_diversity): 0.062, p-value: 0.293
pearsonr(rating, modal_agreement): 0.070, p-value: 0.237
In [43]:
df_mean_inverse['rating_sd'] = df_sd_inverse['rating']
g = sns.lmplot(x='rating_sd', y='modal_agreement', data=df_mean_inverse)
g = sns.lmplot(x='rating_sd', y='simpson_diversity', data=df_mean_inverse)
x = pearsonr(df_mean_inverse['rating_sd'], df_mean_inverse['simpson_diversity'])
print(f'pearsonr(rating_sd, simpson_diversity): {x[0]:.3f}, p-value: {x[1]:.3f}')
x = pearsonr(df_mean_inverse['rating_sd'], df_mean_inverse['modal_agreement'])
print(f'pearsonr(rating_sd, modal_agreement): {x[0]:.3f}, p-value: {x[1]:.3f}')
pearsonr(rating_sd, simpson_diversity): 0.228, p-value: 0.000
pearsonr(rating_sd, modal_agreement): 0.228, p-value: 0.000
No description has been provided for this image
No description has been provided for this image

In short: nameability (measured as simpson diversity and name agreement for the modal name) is weakly correlated with cosine similarity between colors and dimension axis poles, but not with human ratings, regardless of whether we fit the nameability to the ratings (by computing difference scores for the nameability measures) or fit the ratings to the nameability (by computing inverse ratings for the left poles of the dimension axes).

More figures¶

Mean color ratings on each dimension¶

In [ ]:
df_viz = df_joint[df_joint['dimension'] != 'high-low']

df_means = df_viz.groupby(['dimension', 'color', 'word1', 'word2']).mean().reset_index()
dim_order = df_means.groupby('dimension').std().sort_values('rating', ascending=False).reset_index()['dimension']
df_means = df_means.set_index('dimension').loc[dim_order].reset_index()

mins_idx = df_means.groupby(['dimension'])['rating'].transform(min) == df_means['rating']
mins = df_means[mins_idx]
maxs_idx = df_means.groupby(['dimension'])['rating'].transform(max) == df_means['rating']
maxs = df_means[maxs_idx]

df_mins = mins[['word2', 'dimension', 'color']].merge(df_viz[['word2', 'dimension', 'color', 'rating']], how='left', on=['dimension', 'color', 'word2'])
df_maxs = maxs[['word1', 'dimension', 'color']].merge(df_viz[['word1', 'dimension', 'color', 'rating']], how='left', on=['dimension', 'color', 'word1'])

display(df_mins)
display(df_maxs)
In [ ]:
sns.set_style('darkgrid')
all_colors = {color: color for color in df_viz['color']}

fig, ax1 = plt.subplots(figsize=(3, 8))
sns.pointplot(data=df_viz, y='word1', x='rating', hue='color',
               palette=all_colors, join=False, dodge=False, ax=ax1, errorbar=('ci', .95))

ax2 = ax1.twinx()
sns.pointplot(data=df_viz, y='word2', x='rating', hue='color',
               palette=all_colors, join=False, dodge=False, ax=ax2, errorbar=('ci', .95))

ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7]);
In [ ]:
sns.set_style('whitegrid')
mins_colors = {color: color for color in mins['color']}
maxs_colors = {color: color for color in maxs['color']}

fig, ax1 = plt.subplots(figsize=(3, 7))
sns.violinplot(data=df_maxs, y='word1', x='rating', hue='color', #scale='width',
               palette=maxs_colors, dodge=False, ax=ax1, inner=None, cut=0)

ax2 = ax1.twinx()
sns.violinplot(data=df_mins, y='word2', x='rating', hue='color', #scale='area',
               palette=mins_colors, dodge=False, ax=ax2, inner=None, cut=0)

plt.setp(ax1.collections, alpha=.8)
plt.setp(ax2.collections, alpha=.8)
ax1.set(ylabel='')
ax2.set(ylabel='')
ax1.get_legend().remove()
ax2.get_legend().remove()
ax1.set(xlim=[1, 7], xticks=[1, 2, 3, 4, 5, 6, 7])
plt.savefig('figures/color_ratings.pdf', bbox_inches='tight')

Scatterplot with connected points¶

In [ ]:
sns.set_style('darkgrid')
df_blind = df_viz[df_viz['group'] == 'blind'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_sighted = df_viz[df_viz['group'] == 'sighted'].groupby(['group', 'dimension', 'color', 'word1', 'word2']).mean().reset_index()
df_scatter = pd.concat([df_blind, df_sighted])
df_scatter['colordim'] = df_scatter['color'] + '_' + df_scatter['dimension']
df_scatter = df_scatter.sort_values('cosine_fic_z')
means_colors = {row['color']: row['color'] for _, row in df_scatter.iterrows()}
g = sns.FacetGrid(df_scatter, hue='color', col='group',  height=5, palette=means_colors, aspect=.5, sharex=True)
g.map(plt.scatter, 'cosine_fic_z', 'rating', s=10)
g.map(sns.regplot, 'cosine_fic_z', 'rating', scatter=False, ci=False)#, linewidth=.5)

g.set(xlabel='COCA-fiction\nembedding projection')
g.axes[0][0].set(ylabel='mean participant rating')
g.axes[0][0].set(title='blind')
g.axes[0][1].set(title='sighted')
g.set(ylim=[.75, 7.25], xlim=[-2.9, 2.9])
plt.savefig('figures/scatter_color.pdf', bbox_inches='tight')
In [ ]:
df_sighted_mean = df_sighted.groupby(['dimension', 'color']).mean().reset_index()
df_blind_mean = df_blind.groupby(['dimension', 'color']).mean().reset_index()
df_ratings = df_sighted_mean[['dimension', 'color', 'rating']].merge(
    df_blind_mean[['dimension', 'color', 'rating']], on=['dimension', 'color'], how='left'
).rename(columns={'rating_x': 'rating_sighted', 'rating_y': 'rating_blind'})

fig, ax = plt.subplots(figsize=(5, 5))

sns.scatterplot(
    x='rating_sighted',
    y='rating_blind',
    hue='color',
    palette=all_colors,
    legend=False,
    ax=ax,
    data=df_ratings
)
ax.set(ylabel='mean blind association rating', xlabel='mean sighted association rating',
       ylim=[1, 7], xlim=[1, 7]);

def annotate(df, color, dimension, x=0, y=0):
    plt.text(df.loc[(df['color'] == color) & (df['dimension'] == dimension), 'rating_sighted'].values[0] + x,
             df.loc[(df['color'] == color) & (df['dimension'] == dimension), 'rating_blind'].values[0] + y,
             f'{color} on {dimension}', fontdict={'size': 'small'})

annotate(df_ratings, 'white', 'clean-dirty', -1.95, -.05)
annotate(df_ratings, 'blue', 'cold-hot', .05, -.20)
annotate(df_ratings, 'red', 'cold-hot', .1, -.05)
annotate(df_ratings, 'orange', 'cold-hot', .05, +.05)
annotate(df_ratings, 'black', 'cold-hot', -1.75, -.05)
annotate(df_ratings, 'red', 'relaxed-tense', .1, -.05)

#df_ratings.apply(lambda row: annotate(df_ratings, row['color'], row['dimension']), axis=1)

plt.savefig('figures/blind_vs_sighted_scatter.pdf', bbox_inches='tight')
In [68]:
df_viz = df_joint[df_joint['dimension'] != 'high-low']
#df_viz = df_viz[df_viz.group == 'sighted']
all_colors = {color: color for color in df_viz['color']}
sns.set(style='darkgrid')
In [78]:
fig, axes = plt.subplots(3, 6, sharex=True, sharey=True)
axes[2, 5].set_axis_off()
df_panels = df_viz[['dimension', 'color', 'rating', 'cosine_fic_z']].groupby(['dimension', 'color']).mean().reset_index()
for i, dimension in enumerate(df_panels.dimension.unique()):
    sns.scatterplot(
        x='rating',
        y='cosine_fic_z',
        hue='color',
        palette=all_colors,
        data=df_panels[df_panels.dimension == dimension],
        ax=axes[i // 6, i % 6],
        legend=False
    )
    """
    sns.regplot(
        x='rating',
        y='cosine_fic_z',
        scatter=False,
        color='gray',
        data=df_viz[df_viz.dimension == dimension],
        ax=axes[i // 6, i % 6]
    )
    """
    axes[i // 6, i % 6].set(
        title=dimension,
        xticks=range(2, 7),
        yticks=range(-2, 3),
        xlabel=None,
        ylabel=None
    )
    
axes[1, 0].set(ylabel='COCA-fiction embedding projection')
axes[2, 2].set(xlabel='mean participant rating')
plt.savefig('figures/rating_vs_cosine_scatter.pdf', bbox_inches='tight')
No description has been provided for this image

Convert notebook to html¶

In [2]:
convert_notebook('data_prep')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 convert_notebook('data_prep')

NameError: name 'convert_notebook' is not defined
In [ ]: